#!/usr/bin/Rscript
##########################################################################################
# Social Media and Policy Responses to the COVID-19 Pandemic in Switzerland
##########################################################################################
# Description:
##########################################################################################
# Preparation of SMD Data
##########################################################################################
# Contents
##########################################################################################
# 1) Dependencies
# 2) Preparations
# 3) Get Data From DB
# 4) Load Data 
# 5) Sentiment
# 6) Topic allocation
# 7) Finishing touch
# 8) Save curated data
##########################################################################################
# 1) Dependencies
##########################################################################################
suppressPackageStartupMessages(library(tools))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(quanteda))
suppressPackageStartupMessages(library(quantreg))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(glue))
suppressPackageStartupMessages(library(pbmcapply))
suppressPackageStartupMessages(library(urltools))
suppressPackageStartupMessages(library(cld2))
suppressPackageStartupMessages(library(elastic))
suppressPackageStartupMessages(library(rjson))
suppressPackageStartupMessages(library(jsonlite))
##########################################################################################
# 2) Preparations
##########################################################################################
# Start from an empty workspace so that objects left over from an interactive
# session cannot leak into this run.
rm(list = ls())

# - set dir
# Rscript hands over the path of the running script as a `--file=<path>`
# argument; pick that argument out of the command line.
args <- commandArgs()

scriptName <- args[substr(args, 1, 7) == "--file="]

if (length(scriptName) == 0) {
  # No `--file=` argument: fall back to the file open in the RStudio editor.
  scriptName <- rstudioapi::getSourceEditorContext()$path
} else {
  # Strip the 7-character "--file=" prefix.
  scriptName <- substr(scriptName, 8, nchar(scriptName))
}

# Directory that contains the script. dirname() also copes with a bare file
# name (it returns "."), whereas slicing the path with a regular expression
# produced NA in that case and made setwd() fail.
pathName <- dirname(scriptName)

# All relative paths below ("../data", "../lib") are resolved from here.
setwd(pathName)
parent_path <- getwd()

#Elastic Query Function:
# Download the documents of an Elasticsearch index one day at a time.
#
# The period from Date1 to Date2 is cut into consecutive pairs of days and
# every pair is queried as `pubDateTime:[day TO next_day]`. Square brackets
# denote an inclusive range in the query-string syntax, so a document dated
# exactly on a boundary can be returned by two neighbouring queries;
# duplicates are NOT removed here.
#
# Arguments:
#   x      Connection object handed to Search() as `conn`.
#   index  Name of the index to search.
#   type   Document type to search.
#   Date1  First day of the period (a Date or anything as.Date() can parse
#          after conversion to character).
#   Date2  Last day of the period, same format as Date1.
#
# Returns:
#   The hits of all day pairs bound into one data frame. NULL when no query
#   returned a hit or when the period contains fewer than two days.
getbysmd <- function(x = x, index = "digdemlab", type = "data", Date1 = "", Date2 = ""){
  days <- seq(as.Date(paste0(Date1)), as.Date(paste0(Date2)), by = "days")

  # A single day leaves no [day, next day] pair to query. `1:(length(days)-1)`
  # would iterate over c(1, 0) here and query up to an NA date.
  n_intervals <- length(days) - 1
  if (n_intervals < 1) {
    return(NULL)
  }

  # Upper bound on the hits requested per query.
  max_hits <- 10000

  # Collect the per-day results in a list and bind them once at the end
  # instead of re-copying a growing data frame on every iteration.
  daily_hits <- vector("list", n_intervals)
  for (i in seq_len(n_intervals)) {
    bodyfun <- paste0("pubDateTime:[", days[i], " TO ", days[i + 1], "]")
    # `type` was previously passed under the misspelled name `txpe`, so it
    # never reached Search() as the document type.
    tmp <- Search(conn = x, index = index, type = type, size = max_hits,
                  asdf = TRUE, q = bodyfun)$hits$hits

    # Progress indicator: print the counter every 50 days.
    if (i %% 50 == 0) {
      print(paste0(i))
    }

    if (length(tmp) != 0) {
      if (NROW(tmp) >= max_hits) {
        # The query hit the requested maximum, so the day may be incomplete.
        warning("Query ", bodyfun, " returned ", max_hits,
                " hits, the maximum requested; further documents of that",
                " period may be missing.", call. = FALSE)
      }
      daily_hits[[i]] <- tmp
    }
  }

  # Drop the days without hits (their slots are still NULL).
  daily_hits <- daily_hits[!vapply(daily_hits, is.null, logical(1))]
  if (length(daily_hits) == 0) {
    return(NULL)
  }
  if (length(daily_hits) == 1) {
    # Nothing to bind: hand the single result back untouched.
    return(daily_hits[[1]])
  }
  dplyr::bind_rows(daily_hits)
}
##########################################################################################
# 3) Get Data From DB
##########################################################################################
###### -------- Connect to Elasticsearch and Download the Articles -------- ######
# Connection to the Elasticsearch instance on this machine.
x <- connect(es_host = "localhost", port = 9200)

# Fetch the "smd" index day by day for the whole observation period.
df_pr_a <- getbysmd(
  x = x,
  index = "smd",
  type = "data",
  Date1 = as.Date("2019-11-30"),
  Date2 = as.Date("2020-08-17")
)

# Keep only what follows the last dot of every column name, then turn the
# names into syntactically valid, unique R names (a leading "_" becomes "X_").
names(df_pr_a) <- sub("^.*\\.", "", names(df_pr_a))
names(df_pr_a) <- make.names(names(df_pr_a), unique = TRUE)

# Drop the four metadata columns and collapse rows that are exact duplicates.
df_pr_a <- df_pr_a %>%
  dplyr::select(-c("X_index", "X_type", "X_id", "X_score")) %>%
  distinct(.keep_all = TRUE)

# Store the raw download and release the memory.
write_rds(df_pr_a, "../data/SMD_data.RDS")
rm(df_pr_a)
##########################################################################################
# 4) Load Data 
##########################################################################################
# - load Data
# - load Data
df_raw <- read_rds("../data/SMD_data.RDS")
gc()

# - get idea of scope
# Reduce the publication timestamp to a Date and show the covered period.
df_raw$pubDateTime <- as.Date(df_raw$pubDateTime)
range(df_raw$pubDateTime)


# Load Function Data:
# load() restores the stored objects into the workspace AND returns their
# names, so one call per file is enough (each file used to be read twice).

# Load Dictionary from Proksch et al. Multilingual Sentiment analysis (based on the Lexicoder Dictionary)
dictionaries <- load("../lib/auto_dictionaries_lsd.RData")

# Load improved Lexicons
dictionaries_2 <- load("../lib/lsde_frenche_germane.RData")

# Combine the names of all restored dictionary objects into one vector.
dictionaries <- c(dictionaries, dictionaries_2)
rm(dictionaries_2)
gc()
##########################################################################################
# 5) Sentiment
##########################################################################################
# - text Cleaner Function:
# Clean the article texts (column `tx`) and store the result in `df_raw$text`.
# The texts are processed in chunks of 100000; within a chunk every text is
# cleaned by a parallel worker.
text_df <- df_raw$tx
len_l <- length(text_df)
gc()

chunk_size <- 100000
# First index of every chunk. Empty when there is no text at all, so the loop
# below is skipped (`1:ceiling(0 / chunk_size)` would run over c(1, 0)).
chunk_starts <- if (len_l > 0) seq(1, len_l, by = chunk_size) else integer(0)

# One slot per chunk; filled in the loop and flattened once afterwards
# instead of growing a vector with c() on every iteration.
text_chunks <- vector("list", length(chunk_starts))

for (k in seq_along(chunk_starts)) {
  min_k <- chunk_starts[k]
  max_k <- min(min_k + chunk_size - 1, len_l)

  text_tmp_k <- pbmclapply(min_k:max_k, function(l){
    # Get sentences in order: insert a blank after a full stop that directly
    # joins a lower-case letter and an upper-case letter or "(".
    tx_new <- gsub('(?<=[a-z])\\.(?=[A-Z\\(])', '. ', text_df[l], perl = TRUE)
    # Remove URLs together with the whitespace that follows them.
    tx_new <- gsub('http\\S+\\s*', '', tx_new, perl = TRUE)
    # Remove remaining "http" tokens.
    # NOTE(review): this step used to be labelled "Remove Numbers", but the
    # pattern does not touch digits - confirm whether numbers should be removed.
    tx_new <- gsub('http[[:alnum:]]*', '', tx_new, perl = TRUE)
    # Remove leading and trailing whitespace.
    tx_new <- gsub('^[[:space:]]*', '', tx_new, perl = TRUE)
    tx_new <- gsub('[[:space:]]*$', '', tx_new, perl = TRUE)
    # Remove guillemets and curly braces, which quanteda does not handle well.
    # NOTE(review): the pattern starts with an empty alternative; possibly a
    # quotation mark was lost to an encoding problem - confirm.
    tx_new <- gsub('|\\»|\\«|\\}|\\{', '', tx_new)
    # Treat an en dash as a sentence boundary.
    tx_new <- gsub('\\–', '. ', tx_new)
    # Remove braces, square brackets and parentheses.
    tx_new <- gsub('\\{|\\}|\\[|\\]|\\(|\\)', '', tx_new)
    tx_new <- tolower(tx_new)

    as.character(tx_new)
  }, mc.cores = 8)

  text_chunks[[k]] <- unlist(text_tmp_k)
  gc()
}

# as.character() turns the NULL of an empty result into character(0).
text_tmp <- as.character(unlist(text_chunks))
df_raw$text <- text_tmp

# Remove the helper objects; intersect() skips those that were never created
# because the loop did not run.
rm(list = intersect(
  c("len_l", "text_df", "text_tmp", "min_k", "max_k", "text_tmp_k", "k",
    "chunk_size", "chunk_starts", "text_chunks"),
  ls()
))

# - filter texts in rm:
# Keep only the articles whose language code `la` differs from "rm"
# (presumably Romansh - TODO confirm). Rows with a missing code are dropped too.
df_raw <- dplyr::filter(df_raw, la != "rm")
gc()

# Number of articles per language code.
df_raw %>%
  group_by(la) %>%
  summarise(n = n())

# Checkpoint: store the cleaned data on disk and read it back in.
write_rds(df_raw, "../data/SMD_data.RDS")
df_raw <- read_rds("../data/SMD_data.RDS")

# Cut the data into a list of consecutive pieces of at most `n` rows each:
# row i belongs to piece number ceiling(i / n).
nr <- nrow(df_raw)
n <- 10000
dflist <- split(df_raw, as.integer(ceiling(seq_len(nr) / n)))

# The pieces now hold the data; empty the original to free memory.
df_raw <- data.frame()
gc()

#Sentiment Calculator:
# Count, for every document of a corpus, the dictionary terms it contains.
#
# Arguments:
#   protex         quanteda corpus to search.
#   dict_patterns  Patterns handed to dfm() as `select`.
#   n_docs         Number of documents in `protex`.
#
# Returns one row per document with the columns
#   document  number formed by the digits of the document id,
#   words     every matched feature, repeated once per occurrence and
#             separated by blanks ("" when nothing matched),
#   count     total number of matches.
count_dictionary_hits <- function(protex, dict_patterns, n_docs) {
  hits <- convert(
    dfm(protex, select = dict_patterns, verbose = FALSE),
    to = "data.frame"
  )

  if (ncol(hits) < 2) {
    # Only the id column is present: no document contains a dictionary term.
    # `2:ncol(hits)` would turn into 2:1 and select a column that does not
    # exist, so the all-zero result is built directly.
    return(data.frame(
      document = as.numeric(seq_len(n_docs)),
      words = rep("", n_docs),
      count = rep(0, n_docs),
      stringsAsFactors = FALSE
    ))
  }

  hits %>%
    gather(Word, Occurences, 2:ncol(hits)) %>%
    dplyr::mutate(document = as.numeric(gsub("\\D+", "", doc_id))) %>%
    group_by(document) %>%
    summarize(words = paste(rep(Word, Occurences), collapse = " "),
              count = sum(Occurences))
}

# Add the sentiment columns to the articles of one language.
#
# Arguments:
#   data_lang  Articles of a single language; the cleaned text is read from
#              the column `text`.
#   lang       Language code; the dictionary is taken from the object
#              `extendeddict_<lang>`, whose first element is used as the
#              positive and whose second element as the negative word list.
#
# Returns `data_lang` with `sentiment_value`, `positive_words` and
# `negative_words` filled in.
score_language_subset <- function(data_lang, lang) {
  n_docs <- nrow(data_lang)
  if (n_docs == 0) {
    # Nothing to score. Give the columns their final types so that the empty
    # frame can be bound to the scored ones.
    data_lang$sentiment_value <- numeric(0)
    data_lang$positive_words <- character(0)
    data_lang$negative_words <- character(0)
    return(data_lang)
  }

  protex <- corpus(data_lang[, "text"])

  dict_lang_pos_neg <- get(paste0("extendeddict_", lang))
  positive <- count_dictionary_hits(protex, dict_lang_pos_neg[[1]], n_docs)
  negative <- count_dictionary_hits(protex, dict_lang_pos_neg[[2]], n_docs)

  # Both tables are ordered by document number, which is the row order of
  # `data_lang`; the results are therefore assigned by position.
  sentisave <- left_join(positive, negative, by = "document",
                         suffix = c("_pos", "_neg"))

  # Log ratio of positive to negative matches; adding 0.5 to both counts keeps
  # the ratio finite when one of them is zero.
  data_lang$sentiment_value <- as.numeric(
    log((sentisave$count_pos + 0.5) / (sentisave$count_neg + 0.5))
  )
  data_lang$positive_words <- sentisave$words_pos
  data_lang$negative_words <- sentisave$words_neg
  data_lang
}

# Dictionary-based sentiment for German, French, Italian and English articles.
#
# Arguments:
#   data              Data frame with the columns `la` (language code) and
#                     `text` (cleaned article text).
#   dictionarieslist  Not used by the function; kept so that existing calls
#                     keep working.
#   quant_nodes       Number of threads quanteda may use. Note that this
#                     changes the quanteda option for the whole session.
#   nodes             Not used by the function; kept so that existing calls
#                     keep working.
#
# Returns the rows with language code "de", "fr", "it" or "en" - in that
# order - extended by `sentiment_value`, `positive_words` and
# `negative_words`. Rows with any other or a missing language code are
# dropped.
#
# The objects `extendeddict_de`, `extendeddict_fr`, `extendeddict_it` and
# `extendeddict_en` must exist in the workspace.
final_dat <- function(data, dictionarieslist = dictionaries, quant_nodes = 4, nodes = 4){
  quanteda_options(threads = quant_nodes)
  #########################################################
  # Add the sentiment columns; rep() keeps this valid for a frame without rows.
  data$sentiment_value <- rep(NA, nrow(data))
  data$positive_words <- rep(NA, nrow(data))
  data$negative_words <- rep(NA, nrow(data))
  cat("Make sure you have loaded the 'auto_dictionaries_lsd.RData' and 'lsde_frenche_germane.RData' in the environment.\nWithout them the function will not work\n")

  # Split the data by language. which() skips rows with a missing language
  # code, which a plain logical index would turn into rows full of NA.
  languages <- c("de", "fr", "it", "en")
  data_by_language <- lapply(languages, function(lang) {
    data[which(data$la == lang), ]
  })
  # Clear memory:
  rm(data)
  gc()

  cat("Prepocessing of Articles done! Starting Sentiment Analysis!\n")
  #########################################################
  # Score every language with its own dictionary.
  scored <- lapply(seq_along(languages), function(i) {
    score_language_subset(data_by_language[[i]], languages[i])
  })
  rm(data_by_language)

  # Combine all frames again:
  data <- dplyr::bind_rows(scored)
  rm(scored)
  gc()
  return(data)
}

# Score every chunk. The results are collected in a list and bound once at the
# end; calling rbind() inside the loop copies the growing data frame again on
# every iteration. seq_along() also copes with an empty list, for which
# `1:length(dflist)` would run over c(1, 0).
scored_chunks <- vector("list", length(dflist))
for (chunk_nr in seq_along(dflist)) {
  scored_chunks[[chunk_nr]] <- final_dat(dflist[[chunk_nr]], quant_nodes = 4, nodes = 4)
  print(paste0(chunk_nr," th Element of list processed!\n"))
  gc()
}

# With no chunk at all `df_raw` keeps its current value.
if (length(scored_chunks) > 0) {
  df_raw <- do.call(rbind, scored_chunks)
}
rm(scored_chunks)
if (exists("chunk_nr")) {
  rm(chunk_nr)
}

df_raw$sentiment_value <- as.numeric(df_raw$sentiment_value)
rm(dflist)
gc()
saveRDS(df_raw, "../data/SMD_data.RDS")
##########################################################################################
# 6) Topic allocation
##########################################################################################
df_raw <- readRDS("../data/SMD_data.RDS")

# Build one regular expression out of a vector of search terms. Every term is
# allowed to be preceded by punctuation and letters/digits and to be followed
# by letters/digits; the alternatives are joined with "|".
build_search_pattern <- function(terms) {
  paste0("[[:punct:]]*[a-zA-Z0-9]*", terms, "[a-zA-Z0-9]*") %>%
    paste(collapse = "|")
}

# Flag the texts that match a regular expression.
#
# Arguments:
#   texts       Character vector of texts; they are lower-cased before matching.
#   pattern     Regular expression handed to str_detect().
#   cores       Number of parallel workers.
#   chunk_size  Number of texts handed to the workers at a time.
#
# Returns a numeric vector as long as `texts`: 1 for a match, 0 for no match,
# NA for a missing text.
flag_texts <- function(texts, pattern, cores, chunk_size = 100000) {
  n_texts <- length(texts)
  if (n_texts == 0) {
    # `1:ceiling(0 / chunk_size)` would run over c(1, 0); there is nothing to do.
    return(numeric(0))
  }

  chunk_starts <- seq(1, n_texts, by = chunk_size)
  # One slot per chunk, flattened once at the end instead of growing a vector.
  flags <- vector("list", length(chunk_starts))
  for (k in seq_along(chunk_starts)) {
    rows <- chunk_starts[k]:min(chunk_starts[k] + chunk_size - 1, n_texts)
    hits <- pbmclapply(rows, function(l) {
      as.numeric(str_detect(tolower(texts[l]), pattern))
    }, mc.cores = cores)
    flags[[k]] <- unlist(hits)
    gc()
  }

  flags <- unlist(flags)
  # Fail loudly if the workers did not return one number per text.
  if (!is.numeric(flags) || length(flags) != n_texts) {
    stop("Pattern matching did not return one numeric flag per text.",
         call. = FALSE)
  }
  flags
}

# Covid 19 (Yes / No):
search_pattern_covid <- build_search_pattern(
  c('corona','covid19','coronaschweiz','coronach','coronavirus','coronavirusschweiz','epidemie',
    'social distancing', 'coronatests', 'pandemie', 'corona-pandemie',
    'coronakrise','covid19ch','covidch','bag_ofsp_ufsp','coronainfoch', 'swisscovid',
    'pandemie', 'covid', 'coronakrise','swiss-covid-app', 'coronapandemie',
    'corona-sommer', 'covid-19-erkrankungen', 'corona-kredit', 'corona-infektionen',
    'lockdown', 'schutzmaske','beatmungsgerät', 'beatmungsgeräte','pandémie',
    'masques', 'crise sanitaire', 'covid-19', 'sars-cov-2', 'coronagraben', 'swisscovid',
    'coronavirus', 'covid', 'epidémie', 'social distancing', 'garder ses distances',
    'maske','contact tracing', 'masquer', 'maschera', 'respirator', 'hygienemaske', 'ffp2',
    'atemschutz', 'swisscovid','covidioten','neuinfektionen','hospitalisierungsrate',
    'covidapp','coronaapp','swiss-covid-app', 'contact-tracing-app','dp-3t','swisscovidapp',
    'epidemiologisch','antikörper', 'maskenpflicht','maskenzwang','maskenwahn','herdenimunität',
    'coronawarnapp', 'contact-tracing', 'contact tracing', 'besondere lage', 'ausserordendliche lage',
    'swisscovid-app', 'corona-app','covid-codes','corona app', 'corona warn app',' contact tracing app',
    'kontakt verfolgungs app', 'kontakt rückverfolgung')
)

df_raw$iscovid19_txt <- flag_texts(df_raw$tx, search_pattern_covid, cores = 8)
df_raw$topic <- ifelse(df_raw$iscovid19_txt == 1, "COVID19", "Anderes")
saveRDS(df_raw, "../data/SMD_data.RDS")

# Mask Topic:
search_pattern_mask <- build_search_pattern(
  c('schutzmaske', 'masques', 'maske','masquer', 'maschera', 'hygienemaske', 'ffp2', 'mascherina', 'masken', 'atemschutzmasken',
    'atemschutz', 'maskenpflicht','maskenzwang','maskenwahn', 'mundnasenschutz', 'mund-nasen-schutz', 'gesichtsschutz', 'masque')
)

df_raw$ismask_txt <- flag_texts(df_raw$tx, search_pattern_mask, cores = 6)
df_raw$topic_1 <- ifelse(df_raw$ismask_txt == 1, "Mask", "Anderes")
saveRDS(df_raw, "../data/SMD_data.RDS")

# CovidApp Topic:
search_pattern_covidapp <- build_search_pattern(
  c('swiss-covid-app','covidapp','coronaapp','swiss covid app', 'contact-tracing-app','dp-3t','swisscovidapp','swisscovid-app', 'corona-app','covid-codes', 'contact tracing', 'swiss-covid-app',
    'coronawarnapp', 'corona app', 'corona warn app', 'contact tracing app', 'kontaktverfolgungsapp', 'kontakt-verfolgungs-app', 'swisscovid', 'covidcodes', 'covid codes', 'corona app', 'dp3t', 'dp 3t')
)

df_raw$iscovidapp_txt <- flag_texts(df_raw$tx, search_pattern_covidapp, cores = 6)
df_raw$topic_2 <- ifelse(df_raw$iscovidapp_txt == 1, "CovidApp", "Anderes")
saveRDS(df_raw, "../data/SMD_data.RDS")

# CovidApp Topic, wider variant: the same terms plus the stand-alone words
# "app" and "apps". paste0() is required here - paste() separates its
# arguments with a blank, which ended up inside the pattern and made the last
# term ("dp 3t") match only when followed by a space.
search_pattern_covidapp_2 <- paste0(search_pattern_covidapp, "|\\bapp\\b|\\bapps\\b")

df_raw$iscovidapp_txt_2 <- flag_texts(df_raw$tx, search_pattern_covidapp_2, cores = 6)
# The wider variant only counts for texts that were flagged as Covid-19 texts.
df_raw$topic_2_2 <- ifelse(df_raw$iscovid19_txt == 0, "Anderes",
                           ifelse(df_raw$iscovidapp_txt_2 == 1, "CovidApp", "Anderes"))
saveRDS(df_raw, "../data/SMD_data.RDS")

# Keep the plain Covid-19 / other label before `topic` is overwritten.
df_raw$topic_0 <- df_raw$topic

# Final topic, in order of precedence: app and masks, app, masks, Covid-19,
# other.
# NOTE(review): the label is spelled "Covid19" here but "COVID19" in
# `topic_0` - confirm that this is intended.
df_raw$topic <- ifelse(df_raw$ismask_txt == 1 & df_raw$topic_2_2 == "CovidApp", "App & Masks",
                       ifelse(df_raw$topic_2_2 == "CovidApp", "App",
                              ifelse(df_raw$ismask_txt == 1, "Masks",
                                     ifelse(df_raw$iscovid19_txt == 1, "Covid19", "Anderes"))))

# Number of articles per topic.
df_raw %>% group_by(topic) %>% summarise(n = n())
##########################################################################################
# 7) Finishing touch
##########################################################################################
# Show the columns of the final data set.
names(df_raw)
##########################################################################################
# 8) Save curated data
##########################################################################################
saveRDS(df_raw, "../data/SMD_data.RDS")

# Remove the data and the dictionaries of the languages listed below from the
# workspace.
rm(list = c(
  "df_raw",
  paste0(
    "extendeddict_",
    c("bg", "cs", "da", "el", "es", "et", "fi", "hu", "lt",
      "lv", "nl", "pl", "pt", "ro", "sk", "sl", "sv")
  )
))
gc()
